import libraries¶

In [1]:
import pandas as pd
import numpy as np

# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import dataset¶

In [2]:
import pandas as pd
df=pd.read_csv(r'C:\Users\SACHIN\OneDrive\Documents\ML MODELS DATA\shopping_trends.csv')
df.head()
Out[2]:
Customer ID Age Gender Item Purchased Category Purchase Amount (USD) Location Size Color Season Review Rating Subscription Status Payment Method Shipping Type Discount Applied Promo Code Used Previous Purchases Preferred Payment Method Frequency of Purchases
0 1 55 Male Blouse Clothing 53 Kentucky L Gray Winter 3.1 Yes Credit Card Express Yes Yes 14 Venmo Fortnightly
1 2 19 Male Sweater Clothing 64 Maine L Maroon Winter 3.1 Yes Bank Transfer Express Yes Yes 2 Cash Fortnightly
2 3 50 Male Jeans Clothing 73 Massachusetts S Maroon Spring 3.1 Yes Cash Free Shipping Yes Yes 23 Credit Card Weekly
3 4 21 Male Sandals Footwear 90 Rhode Island M Maroon Spring 3.5 Yes PayPal Next Day Air Yes Yes 49 PayPal Weekly
4 5 45 Male Blouse Clothing 49 Oregon M Turquoise Spring 2.7 Yes Cash Free Shipping Yes Yes 31 PayPal Annually
In [3]:
df.shape
Out[3]:
(3900, 19)
In [6]:
df.info()  # data information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Customer ID               3900 non-null   int64  
 1   Age                       3900 non-null   int64  
 2   Gender                    3900 non-null   object 
 3   Item Purchased            3900 non-null   object 
 4   Category                  3900 non-null   object 
 5   Purchase Amount (USD)     3900 non-null   int64  
 6   Location                  3900 non-null   object 
 7   Size                      3900 non-null   object 
 8   Color                     3900 non-null   object 
 9   Season                    3900 non-null   object 
 10  Review Rating             3900 non-null   float64
 11  Subscription Status       3900 non-null   object 
 12  Payment Method            3900 non-null   object 
 13  Shipping Type             3900 non-null   object 
 14  Discount Applied          3900 non-null   object 
 15  Promo Code Used           3900 non-null   object 
 16  Previous Purchases        3900 non-null   int64  
 17  Preferred Payment Method  3900 non-null   object 
 18  Frequency of Purchases    3900 non-null   object 
dtypes: float64(1), int64(4), object(14)
memory usage: 579.0+ KB
In [7]:
# cheack null values
df.isnull().sum()
Out[7]:
Customer ID                 0
Age                         0
Gender                      0
Item Purchased              0
Category                    0
Purchase Amount (USD)       0
Location                    0
Size                        0
Color                       0
Season                      0
Review Rating               0
Subscription Status         0
Payment Method              0
Shipping Type               0
Discount Applied            0
Promo Code Used             0
Previous Purchases          0
Preferred Payment Method    0
Frequency of Purchases      0
dtype: int64

Gender¶

In [7]:
df['Gender'].value_counts()
Out[7]:
Gender
Male      2652
Female    1248
Name: count, dtype: int64
In [25]:
x=df['Gender'].value_counts()
plt.figure(figsize=(8,6))
sns.barplot(x=x.index, y=x.values, palette=colors)
plt.xlabel("Gender")
plt.ylabel('Counts')
plt.xticks(rotation=45)
plt.legend(title='Gender')
for index, value in enumerate(x.values):
    plt.text(index, value+2.3, str(value), color='black', ha="right", size='small')
plt.show()
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No description has been provided for this image

Category¶

In [26]:
df['Category'].value_counts()
Out[26]:
Category
Clothing       1737
Accessories    1240
Footwear        599
Outerwear       324
Name: count, dtype: int64
In [27]:
x=df['Category'].value_counts()
plt.figure(figsize=(8,5))
sns.barplot(x=x.index, y=x.values)
plt.xlabel("Category")
plt.ylabel('COUNTS')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [34]:
x=df['Category'].value_counts()
labels=x.index
explode=[0.1,0.1,0,0]
plt.pie(x,startangle=45,rotatelabels=True,labels=labels,autopct='%.f%%',explode=explode)    
plt.title('category size distribution')
plt.show()
No description has been provided for this image

Location¶

In [35]:
df['Location'].value_counts()
Out[35]:
Location
Montana           96
California        95
Idaho             93
Illinois          92
Alabama           89
Minnesota         88
Nebraska          87
New York          87
Nevada            87
Maryland          86
Delaware          86
Vermont           85
Louisiana         84
North Dakota      83
Missouri          81
West Virginia     81
New Mexico        81
Mississippi       80
Indiana           79
Georgia           79
Kentucky          79
Arkansas          79
North Carolina    78
Connecticut       78
Virginia          77
Ohio              77
Tennessee         77
Texas             77
Maine             77
South Carolina    76
Colorado          75
Oklahoma          75
Wisconsin         75
Oregon            74
Pennsylvania      74
Washington        73
Michigan          73
Alaska            72
Massachusetts     72
Wyoming           71
Utah              71
New Hampshire     71
South Dakota      70
Iowa              69
Florida           68
New Jersey        67
Hawaii            65
Arizona           65
Kansas            63
Rhode Island      63
Name: count, dtype: int64
In [37]:
x=df['Location'].value_counts()
plt.figure(figsize=(11,6))
plt.scatter(x.index,y=x.values)
plt.xlabel('locations')
plt.ylabel('counts')
plt.title('Different locations')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [39]:
x=df['Location'].value_counts()
plt.figure(figsize=(10,6))
plt.bar(x.index,height=x.values)
plt.xlabel('locations')
plt.ylabel('counts')
plt.title('Different locations')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image

Size and Season¶

In [55]:
df.head(2)
Out[55]:
Customer ID Age Gender Item Purchased Category Purchase Amount (USD) Location Size Color Season Review Rating Subscription Status Payment Method Shipping Type Discount Applied Promo Code Used Previous Purchases Preferred Payment Method Frequency of Purchases
0 1 55 Male Blouse Clothing 53 Kentucky L Gray Winter 3.1 Yes Credit Card Express Yes Yes 14 Venmo Fortnightly
1 2 19 Male Sweater Clothing 64 Maine L Maroon Winter 3.1 Yes Bank Transfer Express Yes Yes 2 Cash Fortnightly
In [44]:
x=df["Size"].value_counts()
print(x)
Size
M     1755
L     1053
S      663
XL     429
Name: count, dtype: int64
In [45]:
y=df["Season"].value_counts()
print(y)
Season
Spring    999
Fall      975
Winter    971
Summer    955
Name: count, dtype: int64
In [46]:
df = pd.DataFrame({
    'Size': x.index,
    'Count_X': x.values,
    'Season': y.index,
    'Count_Y': y.values
})
plt.figure(figsize=(6,4))
sns.scatterplot(data=df, x='Count_X', y='Count_Y', hue='Season', palette='viridis', s=170)
plt.xlabel('Size')
plt.ylabel('Season')
plt.title('Scatter Plot of Size vs Season')
plt.legend(title='Season')
plt.show()
No description has been provided for this image

Item Purchased¶

In [56]:
x=df['Item Purchased'].value_counts()
x
Out[56]:
Item Purchased
Blouse        171
Jewelry       171
Pants         171
Shirt         169
Dress         166
Sweater       164
Jacket        163
Belt          161
Sunglasses    161
Coat          161
Sandals       160
Socks         159
Skirt         158
Shorts        157
Scarf         157
Hat           154
Handbag       153
Hoodie        151
Shoes         150
T-shirt       147
Sneakers      145
Boots         144
Backpack      143
Gloves        140
Jeans         124
Name: count, dtype: int64
In [59]:
plt.figure(figsize=(14,8))
plt.bar(x.index,height=x.values)
plt.xlabel('item purchased')
plt.ylabel('counts')
plt.xticks(rotation=45)
for index, value in enumerate(x.values):
    plt.text(index,value+2, str(value), color='green', ha="center", size='small')
plt.show()
No description has been provided for this image

Relation between item purchased and size¶

In [60]:
data=df.groupby('Item Purchased')['Size'].value_counts().unstack()
print(data)
Size             L   M   S  XL
Item Purchased                
Backpack        35  76  18  14
Belt            39  66  37  19
Blouse          46  75  29  21
Boots           40  70  21  13
Coat            45  66  36  14
Dress           47  77  27  15
Gloves          35  66  22  17
Handbag         34  72  29  18
Hat             41  67  23  23
Hoodie          40  68  26  17
Jacket          48  82  20  13
Jeans           39  41  26  18
Jewelry         39  77  37  18
Pants           46  80  25  20
Sandals         39  75  29  17
Scarf           45  65  28  19
Shirt           41  86  20  22
Shoes           47  66  22  15
Shorts          46  67  27  17
Skirt           53  67  27  11
Sneakers        46  56  21  22
Socks           40  74  26  19
Sunglasses      39  73  36  13
Sweater         42  77  27  18
T-shirt         41  66  24  16
In [64]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(16,14))
data_melted = data.reset_index().melt(id_vars='Item Purchased', var_name='Size', value_name='Count')
colors = sns.color_palette("Set2", len(x))
sns.barplot(x='Item Purchased', y='Count', hue='Size', data=data_melted, palette=colors)
plt.title('Distribution of Item Purchased by Size')
plt.xlabel('Item Purchased')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Size', bbox_to_anchor=(0, 1))
plt.show()
No description has been provided for this image
In [67]:
plt.figure(figsize=(16,14))
data_melted = data.reset_index().melt(id_vars='Item Purchased', var_name='Size', value_name='Count')
colors = sns.color_palette("Set2", len(x))
sns.barplot(x='Item Purchased', y='Count', hue='Size', data=data_melted, palette=colors)
plt.title('Distribution of Item Purchased by Size')
plt.xlabel('Item Purchased')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Size', bbox_to_anchor=(0, 1))
for p in plt.gca().patches:
    plt.text(p.get_x() + p.get_width() / 2, p.get_height() + 1, 
             '{:1.0f}'.format(p.get_height()), ha='center', va='center')
plt.show()
No description has been provided for this image

relation between Size and Gender¶

In [68]:
data2=df.groupby(['Size','Gender']).size().unstack()
data2
Out[68]:
Gender Female Male
Size
L 337 716
M 590 1165
S 187 476
XL 134 295
In [69]:
plt.figure(figsize=(16,12))
data_melted=data2.reset_index().melt(id_vars='Size',var_name='Gender',value_name='Count')
sns.barplot(data=data_melted,x='Gender',y='Count',hue='Size',palette=colors)
plt.title('Gender Across Different Size')
plt.xlabel('Gender')
plt.ylabel('COUNTS')
plt.show()
No description has been provided for this image
In [81]:
plt.figure(figsize=(16,12))
data_melted=data2.reset_index().melt(id_vars='Size',var_name='Gender',value_name='Count')
sns.barplot(data=data_melted,x='Gender',y='Count',hue='Size',palette=colors)
plt.title('Gender Across Different Size')
plt.xlabel('Gender')
plt.ylabel('COUNTS')
for p in plt.gca().patches:
    plt.text(p.get_x() + p.get_width() / 2, p.get_height() + 10, 
             '{:1.0f}'.format(p.get_height()), ha='right', va='center')
plt.show()
No description has been provided for this image

category wise subscription status¶

In [82]:
x=df.groupby(['Category','Subscription Status']).size().unstack()
x
Out[82]:
Subscription Status No Yes
Category
Accessories 906 334
Clothing 1280 457
Footwear 428 171
Outerwear 233 91
In [83]:
plt.figure(figsize=(16,10))
data_melt=x.reset_index().melt(id_vars='Category',var_name='Subscription Status',value_name='Count')
sns.barplot(data=data_melt,x='Subscription Status',y='Count',hue='Category')
plt.title('Subscription Status across differnt Category')
plt.xlabel('Subscription Status')
plt.ylabel('count')
plt.xticks(rotation=30)
plt.show()
No description has been provided for this image
In [86]:
plt.figure(figsize=(16,10))
data_melt=x.reset_index().melt(id_vars='Category',var_name='Subscription Status',value_name='Count')
sns.barplot(data=data_melt,x='Subscription Status',y='Count',hue='Category')
plt.title('Subscription Status across differnt Category')
plt.xlabel('Subscription Status')
plt.ylabel('count')
plt.xticks(rotation=30)
for p in plt.gca().patches:
    plt.text(p.get_x() + p.get_width() / 2, p.get_height() + 14, 
             '{:1.0f}'.format(p.get_height()), ha='right', va='center')
plt.show()
No description has been provided for this image
In [87]:
import matplotlib.pyplot as plt
plt.figure(figsize=(16, 10))
x.plot(kind='box', stacked=True, figsize=(16, 10), colormap='tab20')
plt.title('Subscription Status across different categories')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=30)
plt.legend(title='Subscription Status')
plt.show()
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
<Figure size 1600x1000 with 0 Axes>
No description has been provided for this image
In [97]:
import matplotlib.pyplot as plt
plt.figure(figsize=(16, 10))
ax=x.plot(kind='bar', stacked=True, figsize=(16, 10), colormap='tab20')
plt.title('Subscription Status across different categories')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=360)
plt.legend(title='Subscription Status')
plt.show()
<Figure size 1600x1000 with 0 Axes>
No description has been provided for this image
In [96]:
import matplotlib.pyplot as plt
plt.figure(figsize=(16, 10))
ax=x.plot(kind='bar', stacked=True, figsize=(16, 10), colormap='tab20')
plt.title('Subscription Status across different categories')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=360)
plt.legend(title='Subscription Status')
for container in ax.containers:
    ax.bar_label(container, label_type='center', fmt='%d')
plt.show()
<Figure size 1600x1000 with 0 Axes>
No description has been provided for this image

Payment Method with respect to Discount Applied¶

In [98]:
x=df.groupby(['Payment Method','Discount Applied']).size().unstack()
x
Out[98]:
Discount Applied No Yes
Payment Method
Bank Transfer 361 271
Cash 358 290
Credit Card 404 292
Debit Card 363 270
PayPal 372 266
Venmo 365 288
In [100]:
data_melt=x.reset_index().melt(id_vars='Payment Method',var_name='Discount Applied',value_name='Counts')
sns.barplot(data=data_melt,x='Payment Method',y='Counts',hue='Discount Applied')
plt.title('Payment Method effects Discount Applied')
plt.xlabel('Payment Method')
plt.ylabel('Counts')
plt.xticks(rotation=30)
plt.show()
No description has been provided for this image
In [101]:
ax=x.plot(kind='bar',stacked=True,colormap='tab20')
for container in ax.containers:
    ax.bar_label(container, label_type='center', fmt='%d')
plt.show()
No description has been provided for this image
In [104]:
x=df['Payment Method'].value_counts()
print(x)
Payment Method
Credit Card      696
Venmo            653
Cash             648
PayPal           638
Debit Card       633
Bank Transfer    632
Name: count, dtype: int64
In [105]:
y=df['Discount Applied'].value_counts()
print(y)
Discount Applied
No     2223
Yes    1677
Name: count, dtype: int64

Demographic Analysis¶

1. Age Distribution¶

a.What is the distribution of customer ages?

In [72]:
age=df['Age'].value_counts().sort_values()
age
Out[72]:
Age
44    51
67    54
20    62
33    63
60    65
61    65
22    66
70    67
39    68
24    68
48    68
34    68
18    69
26    69
21    69
53    70
38    70
23    71
30    71
66    71
47    71
45    72
40    72
65    72
51    72
35    72
64    73
55    73
52    73
36    74
56    74
63    75
59    75
68    75
29    76
46    76
37    77
31    79
28    79
43    79
42    80
19    81
58    81
32    82
62    83
27    83
54    83
50    83
49    84
25    85
41    86
57    87
69    88
Name: count, dtype: int64
In [19]:
df['Age'].describe()
Out[19]:
count    3900.000000
mean       44.068462
std        15.207589
min        18.000000
25%        31.000000
50%        44.000000
75%        57.000000
max        70.000000
Name: Age, dtype: float64
In [92]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10, 6))
sns.histplot(df['Age'], kde=True, color='blue')
plt.title('Age Distribution of Customers', fontsize=16)
plt.xlabel('Age', fontsize=12)
plt.ylabel('Counts', fontsize=12)
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

No description has been provided for this image

b. Is there a difference in the age distribution between genders?

In [35]:
df.groupby('Gender')['Age'].value_counts()
Out[35]:
Gender  Age
Female  45     33
        52     32
        57     32
        62     32
        31     30
               ..
Male    67     40
        33     40
        45     39
        22     38
        44     28
Name: count, Length: 106, dtype: int64
In [21]:
df.groupby('Gender')['Age'].describe()
Out[21]:
count mean std min 25% 50% 75% max
Gender
Female 1248.0 44.007212 14.953843 18.0 31.0 44.0 57.0 70.0
Male 2652.0 44.097285 15.328257 18.0 31.0 44.0 57.0 70.0
In [32]:
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Gender', y='Age', palette='viridis')
plt.title('Age Distribution by Gender', fontsize=16)
plt.xlabel('Gender', fontsize=12)
plt.ylabel('Age', fontsize=12)
plt.show()
No description has been provided for this image

2. Gender Distribution:¶

a.What is the proportion of males and females in the dataset?

In [45]:
df['Gender'].value_counts()*100/3900
Out[45]:
Gender
Male      68.0
Female    32.0
Name: count, dtype: float64

3. Location Analysis:¶

a.Which locations have the most customers?

In [29]:
x=df['Location'].value_counts()
x.head(10)
Out[29]:
Location
Montana       96
California    95
Idaho         93
Illinois      92
Alabama       89
Minnesota     88
Nebraska      87
New York      87
Nevada        87
Maryland      86
Name: count, dtype: int64
In [30]:
plt.figure(figsize=(12, 6))
sns.barplot(x=x.index, y=x.values, palette='viridis')
plt.title('Top 10 Locations with Most Customers', fontsize=16)
plt.xlabel('Location', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
plt.xticks(rotation=45)
for index, value in enumerate(x.values):
    plt.text(index,value+2, str(value), color='green', ha="center", size='small')
plt.show()
No description has been provided for this image

Purchase Behavior¶

4. Purchase Amount Analysis:¶

a.What is the distribution of purchase amounts?

In [78]:
x=df['Purchase Amount (USD)'].value_counts().reset_index(inplace=False)
x
Out[78]:
Purchase Amount (USD) count
0 94 62
1 32 62
2 36 62
3 51 61
4 90 60
... ... ...
76 100 36
77 87 35
78 49 35
79 69 34
80 61 33

81 rows × 2 columns

In [83]:
df['Purchase Amount (USD)'].max()
Out[83]:
100
In [87]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x=x['Purchase Amount (USD)'],y=x['count'])
plt.title('Purchase Amount (USD) Distribution', fontsize=16)
plt.xlabel('Purchase Amount (USD)', fontsize=12)
plt.ylabel('Counts', fontsize=12)
plt.show()
No description has been provided for this image
In [74]:
df.head()
Out[74]:
Customer ID Age Gender Item Purchased Category Purchase Amount (USD) Location Size Color Season Review Rating Subscription Status Payment Method Shipping Type Discount Applied Promo Code Used Previous Purchases Preferred Payment Method Frequency of Purchases
0 1 55 Male Blouse Clothing 53 Kentucky L Gray Winter 3.1 Yes Credit Card Express Yes Yes 14 Venmo Fortnightly
1 2 19 Male Sweater Clothing 64 Maine L Maroon Winter 3.1 Yes Bank Transfer Express Yes Yes 2 Cash Fortnightly
2 3 50 Male Jeans Clothing 73 Massachusetts S Maroon Spring 3.1 Yes Cash Free Shipping Yes Yes 23 Credit Card Weekly
3 4 21 Male Sandals Footwear 90 Rhode Island M Maroon Spring 3.5 Yes PayPal Next Day Air Yes Yes 49 PayPal Weekly
4 5 45 Male Blouse Clothing 49 Oregon M Turquoise Spring 2.7 Yes Cash Free Shipping Yes Yes 31 PayPal Annually
In [90]:
import plotly.express as px

fig=px.scatter(x,x=x['Purchase Amount (USD)'],y=x['count'],
              title=("purchase amount distribution"),height=600)
fig.show()
In [91]:
import plotly.express as px

fig=px.histogram(df,df['Purchase Amount (USD)'],
              title=("purchase amount distribution"),height=600)
fig.show()
In [93]:
import plotly.express as px

fig=px.histogram(df,df['Age'],
              title=("purchase amount distribution"),height=600)
fig.show()
In [94]:
df['Age'].value_counts()
Out[94]:
Age
69    88
57    87
41    86
25    85
49    84
50    83
54    83
27    83
62    83
32    82
19    81
58    81
42    80
43    79
28    79
31    79
37    77
46    76
29    76
68    75
59    75
63    75
56    74
36    74
55    73
52    73
64    73
35    72
51    72
65    72
40    72
45    72
47    71
66    71
30    71
23    71
38    70
53    70
18    69
21    69
26    69
34    68
48    68
24    68
39    68
70    67
22    66
61    65
60    65
33    63
20    62
67    54
44    51
Name: count, dtype: int64

b. Are there differences in purchase amounts between genders?

In [101]:
df.groupby('Gender')['Purchase Amount (USD)'].value_counts()
Out[101]:
Gender  Purchase Amount (USD)
Female  82                       24
        32                       22
        52                       22
        99                       21
        70                       21
                                 ..
Male    70                       23
        65                       23
        61                       23
        52                       22
        87                       22
Name: count, Length: 162, dtype: int64
In [100]:
df.groupby('Gender')['Purchase Amount (USD)'].describe()
Out[100]:
count mean std min 25% 50% 75% max
Gender
Female 1248.0 60.249199 23.420556 20.0 40.0 60.0 81.0 100.0
Male 2652.0 59.536199 23.809976 20.0 38.0 60.0 80.0 100.0
In [104]:
plt.figure(figsize=(10, 6))
sns.boxplot(x='Gender', y='Purchase Amount (USD)', data=df, palette='viridis')
plt.title('Distribution of Purchase Amounts by Gender', fontsize=14)
plt.xlabel('Gender', fontsize=12)
plt.ylabel('Purchase Amount (USD)', fontsize=12)
plt.grid(axis='y')
plt.show()
No description has been provided for this image

5. Category Popularity:¶

a.Which product categories are purchased the most?

In [20]:
x=df['Category'].value_counts()
x
Out[20]:
Category
Clothing       1737
Accessories    1240
Footwear        599
Outerwear       324
Name: count, dtype: int64
In [21]:
plt.figure(figsize=(10, 6))
sns.barplot(x=x.index, y=x.values, palette='viridis')
plt.title('Product Categories', fontsize=14)
plt.xlabel('Number of Purchases', fontsize=12)
plt.ylabel('Category', fontsize=12)
for index, value in enumerate(x.values):
    plt.text(index, value + 10, str(value), ha='center', fontsize=11)
plt.show()
No description has been provided for this image
In [26]:
x=df['Category'].value_counts().reset_index()
x
Out[26]:
Category count
0 Clothing 1737
1 Accessories 1240
2 Footwear 599
3 Outerwear 324
In [27]:
import plotly.express as px

fig=px.bar(x,x['Category'],y=x['count'],
              title=("Category Popularity"),color='Category',height=600)
fig.show()

6. Item Analysis:¶

a.What are the most purchased items in each category?

In [6]:
x = df.groupby(['Category', 'Item Purchased']).size().reset_index(name='Count')
x
Out[6]:
Category Item Purchased Count
0 Accessories Backpack 143
1 Accessories Belt 161
2 Accessories Gloves 140
3 Accessories Handbag 153
4 Accessories Hat 154
5 Accessories Jewelry 171
6 Accessories Scarf 157
7 Accessories Sunglasses 161
8 Clothing Blouse 171
9 Clothing Dress 166
10 Clothing Hoodie 151
11 Clothing Jeans 124
12 Clothing Pants 171
13 Clothing Shirt 169
14 Clothing Shorts 157
15 Clothing Skirt 158
16 Clothing Socks 159
17 Clothing Sweater 164
18 Clothing T-shirt 147
19 Footwear Boots 144
20 Footwear Sandals 160
21 Footwear Shoes 150
22 Footwear Sneakers 145
23 Outerwear Coat 161
24 Outerwear Jacket 163
In [7]:
y = x.loc[x.groupby('Category')['Count'].idxmax()]
y
Out[7]:
Category Item Purchased Count
5 Accessories Jewelry 171
8 Clothing Blouse 171
20 Footwear Sandals 160
24 Outerwear Jacket 163
In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

plt.figure(figsize=(8, 5))
plt.bar(y['Category'], y['Count'], color='skyblue', width=0.5)
plt.title('Most Purchased Items by Category (Matplotlib)', fontsize=14)
plt.ylabel('Count', fontsize=12)
plt.xlabel('Category', fontsize=12)
plt.xticks(rotation=360)
plt.tight_layout()
for i, value in enumerate(y['Count']):
    plt.text(i, value + 2, str(value), ha='center', fontsize=10)
plt.show()
No description has been provided for this image
In [34]:
plt.figure(figsize=(8, 5))
sns.barplot(x=y['Category'], y=y['Count'], palette='viridis',width=0.5)
plt.title('Most Purchased Items by Category (Seaborn)', fontsize=14)
plt.ylabel('Count', fontsize=12)
plt.xlabel('Category', fontsize=12)
plt.xticks(rotation=360)
plt.tight_layout()
for i, value in enumerate(y['Count']):
    plt.text(i, value + 2, str(value), ha='center', fontsize=10)
plt.show()
No description has been provided for this image
In [23]:
fig = px.bar(
    x=y['Category'],
    y=y['Count'],
    text=y['Count'],
    labels={'x': 'Category', 'y': 'Count'},
    title='Most Purchased Items by Category (Plotly)',
    color=y['Category'], width=800, height=500)
fig.show()
In [24]:
fig = px.bar(
    x=y['Category'],
    y=y['Count'],
    labels={'x': 'Category', 'y': 'Count'},
    title='Most Purchased Items by Category (Plotly)',
    color=y['Category'], width=800, height=500)
fig.show()

7. Size and Color Preferences:¶

a.What sizes and colors are the most popular among customers?

In [131]:
df.head(2)
Out[131]:
Customer ID Age Gender Item Purchased Category Purchase Amount (USD) Location Size Color Season Review Rating Subscription Status Payment Method Shipping Type Discount Applied Promo Code Used Previous Purchases Preferred Payment Method Frequency of Purchases
0 1 55 Male Blouse Clothing 53 Kentucky L Gray Winter 3.1 Yes Credit Card Express Yes Yes 14 Venmo Fortnightly
1 2 19 Male Sweater Clothing 64 Maine L Maroon Winter 3.1 Yes Bank Transfer Express Yes Yes 2 Cash Fortnightly
In [45]:
size_popularity = df['Size'].value_counts()
size_popularity
Out[45]:
Size
M     1755
L     1053
S      663
XL     429
Name: count, dtype: int64
In [46]:
color_popularity = df['Color'].value_counts()
color_popularity
Out[46]:
Color
Olive        177
Yellow       174
Silver       173
Teal         172
Green        169
Black        167
Cyan         166
Violet       166
Gray         159
Maroon       158
Orange       154
Charcoal     153
Pink         153
Magenta      152
Blue         152
Purple       151
Peach        149
Red          148
Beige        147
Indigo       147
Lavender     147
Turquoise    145
White        142
Brown        141
Gold         138
Name: count, dtype: int64
In [58]:
ax=df['Size'].value_counts()
labels=ax.index
explode=[0.1,0.1,0,0]
plt.pie(ax,startangle=45,rotatelabels=True,labels=labels,autopct='%.f%%',explode=explode)    
plt.title('Popular Size')
plt.show()
No description has been provided for this image
In [39]:
plt.figure(figsize=(12, 6))
sns.barplot(x=size_popularity.index, y=size_popularity.values, palette='viridis', width=0.5)
plt.title('Popularity of Sizes', fontsize=14)
plt.xlabel('Size', fontsize=12)
plt.ylabel('Number of Purchases', fontsize=12)
plt.tight_layout()
for index, value in enumerate(size_popularity.values):
    plt.text(index, value + 0.5, str(value), color='black', ha='center', fontsize=10)
plt.show()
No description has been provided for this image
In [40]:
sns.barplot(x=color_popularity.index, y=color_popularity.values, palette='viridis')
plt.title('Popularity of Colors', fontsize=14)
plt.ylabel('Number of Purchases', fontsize=12)
plt.xlabel('Color', fontsize=12)
plt.xticks(rotation=90)
plt.tight_layout()
for index, value in enumerate(color_popularity.values):
    plt.text(index, value + 0.5, str(value), color='black', ha='center', fontsize=6)
plt.show()
No description has been provided for this image
In [41]:
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.barplot(x=size_popularity.index, y=size_popularity.values, palette='viridis')
plt.title('Popularity of Sizes', fontsize=14)
plt.xlabel('Size', fontsize=12)
plt.ylabel('Number of Purchases', fontsize=12)

plt.subplot(1, 2, 2)
sns.barplot(x=color_popularity.index, y=color_popularity.values, palette='viridis')
plt.title('Popularity of Colors', fontsize=14)
plt.ylabel('Number of Purchases', fontsize=12)
plt.xlabel('Color', fontsize=12)
plt.xticks(rotation=90)

plt.tight_layout()
plt.show()
No description has been provided for this image

b. Is there a correlation between age and size preferences?

In [69]:
x=df.groupby('Size')['Age'].mean()
x
Out[69]:
Size
L     44.571700
M     44.025641
S     43.865762
XL    43.321678
Name: Age, dtype: float64
In [61]:
plt.figure(figsize=(8, 6))
sns.barplot(x=x.index, y=x.values, palette="viridis", width=0.5)
plt.title('Average Age by Size Preference', fontsize=14)
plt.xlabel('Size', fontsize=12)
plt.ylabel('Average Age', fontsize=12)
for index, value in enumerate(x.values):
    plt.text(index, value + 0.5, f"{value:.2f}", color='black', ha='center', fontsize=10)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [62]:
df['Size_Numeric'] = df['Size'].map({'S': 1, 'M': 2, 'L': 3, 'XL': 4})
In [63]:
correlation = df['Age'].corr(df['Size_Numeric'])
correlation
Out[63]:
0.0004505343087438301
In [66]:
plt.figure(figsize=(8, 6))
sns.regplot(x='Age', y='Size_Numeric', data=df)
plt.title('Relationship Between Age and Size Preferences', fontsize=14)
plt.xlabel('Age', fontsize=12)
plt.ylabel('Size (Numeric)', fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [159]:
sns.barplot(x='Age', y='Size_Numeric', data=df)
plt.title('Age vs. Size Preferences')
plt.xlabel('Age')
plt.ylabel('Size (Numeric)')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [166]:
fig = px.box(df, x='Size', y='Age', title='Age Distribution by Size Preferences',
                labels={'Age': 'Age', 'Size': 'Size'}, color='Size', template='plotly')
fig.show()

Temporal Trends¶

8. Seasonal Preferences:¶

a.What are the most purchased items in each season?

In [76]:
season_item = df.groupby(['Season', 'Item Purchased']).size().reset_index(name='Count')
most_purchased_items = season_item.loc[season_item.groupby('Season')['Count'].idxmax()].reset_index(drop=True)
most_purchased_items
Out[76]:
Season Item Purchased Count
0 Fall Jacket 54
1 Spring Sweater 52
2 Summer Pants 50
3 Winter Sunglasses 52
In [126]:
plt.figure(figsize=(10, 6))
sns.barplot(data=most_purchased_items, x='Season', y='Count', hue='Item Purchased', palette="viridis")
plt.title('Most Purchased Items in Each Season', fontsize=14)
plt.xlabel('Season', fontsize=12)
plt.ylabel('Purchase Count', fontsize=12)
plt.legend(title='Item Purchased', loc='upper left', bbox_to_anchor=(1, 1))
for container in plt.gca().containers:
    plt.bar_label(container, fmt='%.0f', label_type='edge', fontsize=10, color='black')
plt.show()
No description has been provided for this image
In [83]:
plt.figure(figsize=(8, 8))
explode=[0.1,0.1,0,0]
plt.pie(most_purchased_items['Count'], explode=explode, labels=most_purchased_items['Item Purchased'], autopct='%1.1f%%', startangle=140)
plt.title("Item Purchase Distribution", fontsize=14)
plt.show()
No description has been provided for this image

b. How does the purchase amount vary by season?

In [90]:
avg= df.groupby('Season')['Purchase Amount (USD)'].mean().reset_index()
avg
Out[90]:
Season Purchase Amount (USD)
0 Fall 61.556923
1 Spring 58.737738
2 Summer 58.405236
3 Winter 60.357364
In [96]:
plt.figure(figsize=(8, 8))
explode=[0,0.1,0.2,0.1]
plt.pie(avg['Purchase Amount (USD)'], explode=explode, labels=avg['Season'], autopct='%1.1f%%', startangle=140)
plt.title('Proportion of Average Purchase Amount by Season', fontsize=14)
plt.show()
No description has been provided for this image
In [98]:
sns.lineplot(x='Season', y='Purchase Amount (USD)', data=avg, marker='o')
plt.title('Average Purchase Amount (USD) by Season', fontsize=14)
plt.ylabel('Average Purchase Amount (USD)', fontsize=12)
plt.xlabel('Season', fontsize=12)
plt.grid(alpha=0.5)
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

No description has been provided for this image

9. Frequency of Purchases:¶

a.How often do customers make purchases (weekly, fortnightly, annually)?

In [104]:
df.head(2)
Out[104]:
Customer ID Age Gender Item Purchased Category Purchase Amount (USD) Location Size Color Season Review Rating Subscription Status Payment Method Shipping Type Discount Applied Promo Code Used Previous Purchases Preferred Payment Method Frequency of Purchases Size_Numeric
0 1 55 Male Blouse Clothing 53 Kentucky L Gray Winter 3.1 Yes Credit Card Express Yes Yes 14 Venmo Fortnightly 3
1 2 19 Male Sweater Clothing 64 Maine L Maroon Winter 3.1 Yes Bank Transfer Express Yes Yes 2 Cash Fortnightly 3
In [116]:
filter_df = df[df['Frequency of Purchases'].isin(['Weekly', 'Fortnightly', 'Annually'])]
counts = filter_df['Frequency of Purchases'].value_counts().reset_index()
counts
Out[116]:
Frequency of Purchases count
0 Annually 572
1 Fortnightly 542
2 Weekly 539
In [129]:
plt.figure(figsize=(10, 6))
sns.barplot(x='Frequency of Purchases', y='count', data=counts, palette='viridis', width=0.5)
plt.title('Frequency of Purchases by Customers', fontsize=14)
plt.ylabel('Number of Customers', fontsize=12)
plt.xlabel('Frequency of Purchases', fontsize=12)
for container in plt.gca().containers:
    plt.bar_label(container, fmt='%.0f', label_type='edge', fontsize=10, color='black')
plt.show()
No description has been provided for this image
In [131]:
sns.lineplot(x='Frequency of Purchases', y='count', data=counts, marker='o')
plt.title('Frequency of Purchases by Customers', fontsize=14)
plt.ylabel('Number of Customers', fontsize=12)
plt.xlabel('Frequency of Purchases', fontsize=12)
plt.grid(alpha=0.5)
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

No description has been provided for this image

Customer Engagement¶

10. Subscription Status:¶

• How does subscription status affect purchase behavior?

In [136]:
df.groupby('Subscription Status')['Purchase Amount (USD)'].mean()
Out[136]:
Subscription Status
No     59.865121
Yes    59.491928
Name: Purchase Amount (USD), dtype: float64
In [142]:
plt.figure(figsize=(10, 6))
sns.barplot(x='Subscription Status', y='Purchase Amount (USD)', data=df, palette='viridis', width=0.5)
plt.title('Effect of Subscription Status on Purchase Amount', fontsize=14)
plt.xlabel('Subscription Status', fontsize=12)
plt.ylabel('Average Purchase Amount (USD)', fontsize=12)
plt.tight_layout()
for container in plt.gca().containers:
    plt.bar_label(container, fmt='%.0f', fontsize=10, color='black')
plt.show()
No description has been provided for this image
In [141]:
plt.figure(figsize=(10, 6))
sns.boxplot(x='Subscription Status', y='Purchase Amount (USD)', data=df, palette='viridis')
plt.title('Effect of Subscription Status on Purchase Frequency', fontsize=14)
plt.xlabel('Subscription Status', fontsize=12)
plt.ylabel('Frequency of Purchases', fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image

Are subscribed customers spending more than non-subscribed customers

In [147]:
df.head(2)
Out[147]:
Customer ID Age Gender Item Purchased Category Purchase Amount (USD) Location Size Color Season Review Rating Subscription Status Payment Method Shipping Type Discount Applied Promo Code Used Previous Purchases Preferred Payment Method Frequency of Purchases
0 1 55 Male Blouse Clothing 53 Kentucky L Gray Winter 3.1 Yes Credit Card Express Yes Yes 14 Venmo Fortnightly
1 2 19 Male Sweater Clothing 64 Maine L Maroon Winter 3.1 Yes Bank Transfer Express Yes Yes 2 Cash Fortnightly

11. Previous Purchases:¶

• Is there a correlation between the number of previous purchases and the review ratings?

In [148]:
corr = df['Previous Purchases'].corr(df['Review Rating'])
corr
Out[148]:
0.004229099465270933
In [150]:
fig = px.scatter(df, x='Previous Purchases', y='Review Rating', title='Previous Purchases vs Review Rating', labels={'Previous Purchases': 'Previous Purchases', 'Review Rating': 'Review Rating'})
fig.show()
In [154]:
sns.regplot(x='Previous Purchases', y='Review Rating', data=df, scatter=False, color='red')
plt.title('Previous Purchases vs Review Rating', fontsize=16)
plt.xlabel('Previous Purchases', fontsize=12)
plt.ylabel('Review Rating', fontsize=12)
plt.show()
No description has been provided for this image

12. Review Ratings:¶

• What is the distribution of review ratings for products?

In [159]:
df['Review Rating'].value_counts()
Out[159]:
Review Rating
3.4    182
4.0    181
4.6    174
4.2    171
2.9    170
4.9    166
3.9    163
3.0    162
2.6    159
4.4    158
3.1    157
3.7    156
3.5    156
2.7    154
3.3    152
3.2    152
3.6    149
4.7    148
4.1    148
4.3    147
4.8    144
3.8    142
4.5    139
2.8    136
5.0     68
2.5     66
Name: count, dtype: int64
In [160]:
df['Review Rating'].describe()
Out[160]:
count    3900.000000
mean        3.749949
std         0.716223
min         2.500000
25%         3.100000
50%         3.700000
75%         4.400000
max         5.000000
Name: Review Rating, dtype: float64
In [162]:
plt.figure(figsize=(10, 6))
sns.histplot(df['Review Rating'], kde=True, color='blue')
plt.title('Review Rating Distribution for products', fontsize=16)
plt.xlabel('Review Rating', fontsize=12)
plt.ylabel('Counts', fontsize=12)
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

No description has been provided for this image
In [167]:
plt.figure(figsize=(8, 6))
sns.boxplot(x=df['Review Rating'], color='lightgreen')
plt.title('Review Rating Distribution for Products', fontsize=16)
plt.xlabel('Review Rating', fontsize=12)
plt.show()
No description has been provided for this image
In [168]:
fig = px.box(df, y="Review Rating", 
             title="Review Rating Distribution for Products")
fig.show()

• How do review ratings vary across product categories?

In [5]:
dist = df.groupby('Category')['Review Rating'].describe()
dist
Out[5]:
count mean std min 25% 50% 75% max
Category
Accessories 1240.0 3.768629 0.715317 2.5 3.2 3.8 4.4 5.0
Clothing 1737.0 3.723143 0.717671 2.5 3.1 3.7 4.3 5.0
Footwear 599.0 3.790651 0.719843 2.5 3.2 3.8 4.4 5.0
Outerwear 324.0 3.746914 0.702598 2.5 3.1 3.8 4.3 5.0
In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Category', y='Review Rating', palette='Set2')
plt.title('Distribution of Review Ratings Across Product Categories', fontsize=16)
plt.xlabel('Product Category', fontsize=12)
plt.ylabel('Review Rating', fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [8]:
import plotly.express as px
fig = px.box(df, x='Category', y='Review Rating', color='Category', 
    title='Distribution of Review Ratings Across Product Categories',
    labels={'Category': 'Product Category', 'Review Rating': 'Review Rating'}, width=800, height=500)
fig.show()
In [10]:
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='Category', y='Review Rating', marker='o', color='blue', linewidth=2.5)
plt.title('Average Review Ratings Across Product Categories', fontsize=16)
plt.xlabel('Product Category', fontsize=12)
plt.ylabel('Average Review Rating', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.tight_layout()
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

No description has been provided for this image
In [11]:
category_means = df.groupby('Category')['Review Rating'].mean().reset_index()
category_means
Out[11]:
Category Review Rating
0 Accessories 3.768629
1 Clothing 3.723143
2 Footwear 3.790651
3 Outerwear 3.746914
In [20]:
plt.figure(figsize=(7, 4))
sns.lineplot(data=category_means, x='Category', y='Review Rating', marker='o', color='blue', linewidth=2.5)
plt.title('Average Review Ratings Across Product Categories', fontsize=16)
plt.xlabel('Product Category', fontsize=12)
plt.ylabel('Average Review Rating', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(alpha=0.5)
plt.tight_layout()
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

No description has been provided for this image

Payment and Discounts¶

13. Payment Method Analysis:¶

• What are the preferred payment methods among customers?

In [16]:
payment_method_counts = df['Preferred Payment Method'].value_counts()
payment_method_counts
Out[16]:
Preferred Payment Method
PayPal           677
Credit Card      671
Cash             670
Debit Card       636
Venmo            634
Bank Transfer    612
Name: count, dtype: int64
In [17]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
In [22]:
plt.figure(figsize=(10, 6))
sns.barplot(x=payment_method_counts.index, y=payment_method_counts.values, palette='viridis', width=0.5)
plt.title('Preferred Payment Methods', fontsize=16)
plt.xlabel('Payment Method', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=3600)
plt.tight_layout()
for i, value in enumerate(payment_method_counts.values):
    plt.text(i, value + 5, str(value), ha='center', fontsize=10)
plt.show()
No description has been provided for this image
In [23]:
plt.figure(figsize=(8, 8))
plt.pie(payment_method_counts.values,labels=payment_method_counts.index,autopct='%1.1f%%',startangle=140)
plt.title('Preferred Payment Methods', fontsize=16)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [25]:
sns.lineplot(x=payment_method_counts.index,y=payment_method_counts.values,marker='o',linewidth=2,color='b')
plt.title('Preferred Payment Methods (Line Plot)', fontsize=16)
plt.xlabel('Payment Method', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=45)
for i, value in enumerate(payment_method_counts.values):
    plt.text(i, value + 5, str(value), ha='center', fontsize=10)
plt.tight_layout()
plt.grid(alpha=0.5)
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
No description has been provided for this image

Does the preferred payment method differ by age or gender?

In [27]:
preferred_payment_by_age_gender = df.groupby(['Gender', 'Age', 'Preferred Payment Method']).size().unstack()
preferred_payment_by_age_gender
Out[27]:
Preferred Payment Method Bank Transfer Cash Credit Card Debit Card PayPal Venmo
Gender Age
Female 18 3.0 6.0 5.0 NaN 3.0 3.0
19 5.0 4.0 4.0 3.0 7.0 4.0
20 2.0 2.0 4.0 NaN 2.0 2.0
21 3.0 3.0 2.0 4.0 4.0 3.0
22 6.0 7.0 2.0 3.0 9.0 1.0
... ... ... ... ... ... ... ...
Male 66 12.0 7.0 9.0 7.0 9.0 6.0
67 5.0 4.0 10.0 8.0 9.0 4.0
68 10.0 13.0 5.0 8.0 6.0 11.0
69 13.0 11.0 9.0 17.0 5.0 8.0
70 9.0 4.0 9.0 7.0 8.0 8.0

106 rows × 6 columns

In [29]:
plt.figure(figsize=(12, 8))
sns.heatmap(preferred_payment_by_age_gender,linewidths=0.5)
plt.title('Preferred Payment Methods by Age Group and Gender', fontsize=16)
plt.xlabel('Preferred Payment Method', fontsize=14)
plt.ylabel('Gender and Age Group', fontsize=14)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [33]:
melted_data = preferred_payment_by_age_gender.reset_index().melt(id_vars=['Gender', 'Age'], var_name='Preferred Payment Method', value_name='Count')
melted_data
Out[33]:
Gender Age Preferred Payment Method Count
0 Female 18 Bank Transfer 3.0
1 Female 19 Bank Transfer 5.0
2 Female 20 Bank Transfer 2.0
3 Female 21 Bank Transfer 3.0
4 Female 22 Bank Transfer 6.0
... ... ... ... ...
631 Male 66 Venmo 6.0
632 Male 67 Venmo 4.0
633 Male 68 Venmo 11.0
634 Male 69 Venmo 8.0
635 Male 70 Venmo 8.0

636 rows × 4 columns

In [39]:
plt.figure(figsize=(14, 8))
sns.barplot(data=melted_data,x='Preferred Payment Method',y='Count',hue='Gender',palette='viridis')
plt.title('Preferred Payment Methods by Age Group and Gender', fontsize=16)
plt.xlabel('Preferred Payment Method', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=45)
plt.legend(title='Gender', fontsize=12)
plt.tight_layout()
for p in plt.gca().patches:
    plt.text(p.get_x() + p.get_width() / 2, p.get_height() + 1, 
             '{:1.0f}'.format(p.get_height()), ha='center', va='center',fontsize=16)
plt.show()
No description has been provided for this image

14. Discounts and Promo Codes:¶

• How often are discounts and promo codes applied?

In [40]:
df.head(2)
Out[40]:
Customer ID Age Gender Item Purchased Category Purchase Amount (USD) Location Size Color Season Review Rating Subscription Status Payment Method Shipping Type Discount Applied Promo Code Used Previous Purchases Preferred Payment Method Frequency of Purchases
0 1 55 Male Blouse Clothing 53 Kentucky L Gray Winter 3.1 Yes Credit Card Express Yes Yes 14 Venmo Fortnightly
1 2 19 Male Sweater Clothing 64 Maine L Maroon Winter 3.1 Yes Bank Transfer Express Yes Yes 2 Cash Fortnightly
In [47]:
promo_code=df['Promo Code Used'].value_counts()
print(promo_code)
promo_code_percentage = df['Promo Code Used'].value_counts(normalize=True) * 100
print(promo_code_percentage)
Promo Code Used
No     2223
Yes    1677
Name: count, dtype: int64
Promo Code Used
No     57.0
Yes    43.0
Name: proportion, dtype: float64
In [48]:
discount_count = df['Discount Applied'].value_counts()
print(discount_count)
discount_percentage = df['Discount Applied'].value_counts(normalize=True) * 100
print(discount_percentage)
Discount Applied
No     2223
Yes    1677
Name: count, dtype: int64
Discount Applied
No     57.0
Yes    43.0
Name: proportion, dtype: float64

Does using a discount or promo code affect the purchase amount?

In [49]:
discount_effect = df.groupby('Discount Applied')['Purchase Amount (USD)'].mean()
promo_code_effect = df.groupby('Promo Code Used')['Purchase Amount (USD)'].mean()
discount_effect, promo_code_effect
Out[49]:
(Discount Applied
 No     60.130454
 Yes    59.279070
 Name: Purchase Amount (USD), dtype: float64,
 Promo Code Used
 No     60.130454
 Yes    59.279070
 Name: Purchase Amount (USD), dtype: float64)
In [56]:
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.barplot(x=discount_effect.index, y=discount_effect.values, palette='viridis',width=0.5)
plt.title('Average Purchase Amount by Discount Applied', fontsize=14)
plt.xlabel('Discount Applied', fontsize=12)
plt.ylabel('Average Purchase Amount (USD)', fontsize=12)
for index, value in enumerate(discount_effect.values):
    plt.text(index, value + 0.5, str(value), color='black', ha='center', fontsize=12)


plt.subplot(1, 2, 2)
sns.barplot(x=promo_code_effect.index, y=promo_code_effect.values, palette='viridis',width=0.5)
plt.title('Average Purchase Amount by Promo Code Used', fontsize=14)
plt.xlabel('Promo Code Used', fontsize=12)
plt.ylabel('Average Purchase Amount (USD)', fontsize=12)
plt.xticks(rotation=90)
for index, value in enumerate(promo_code_effect.values):
    plt.text(index, value + 0.5, str(value), color='black', ha='center', fontsize=12)

plt.tight_layout()
plt.show()
No description has been provided for this image

15. Shipping Types:¶

• What are the most common shipping types chosen by customers?

In [57]:
x=df['Shipping Type'].value_counts()
x
Out[57]:
Shipping Type
Free Shipping     675
Standard          654
Store Pickup      650
Next Day Air      648
Express           646
2-Day Shipping    627
Name: count, dtype: int64
In [59]:
plt.figure(figsize=(10, 6))
sns.barplot(x=x.values, y=x.index, palette='viridis')
plt.title('Most Common Shipping Types Chosen by Customers', fontsize=16)
plt.xlabel('Number of Selections', fontsize=12)
plt.ylabel('Shipping Type', fontsize=12)
for index, value in enumerate(x.values):
    plt.text(value + 5, index, str(value), va='center', fontsize=10)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [62]:
plt.figure(figsize=(8, 8))
x.plot.pie(autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Shipping Types Chosen by Customers', fontsize=14)
plt.ylabel('')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [66]:
plt.figure(figsize=(10, 6))
plt.plot(x.index, x.values, marker='o', linestyle='-', color='purple')
plt.title('Frequency of Shipping Types Chosen by Customers', fontsize=16)
plt.xlabel('Shipping Type', fontsize=12)
plt.ylabel('Number of Selections', fontsize=12)
plt.xticks(rotation=45, fontsize=10) 
plt.grid(visible=True, linestyle='--', alpha=0.7)
for index, value in enumerate(x.values):
    plt.text(index, value + 1.5, str(value), ha='center', fontsize=10, color='black')
plt.tight_layout()
plt.show()
No description has been provided for this image

• Does the purchase amount vary based on the shipping type?

In [69]:
a = df.groupby('Shipping Type')['Purchase Amount (USD)'].mean()
a
Out[69]:
Shipping Type
2-Day Shipping    60.733652
Express           60.475232
Free Shipping     60.410370
Next Day Air      58.631173
Standard          58.460245
Store Pickup      59.893846
Name: Purchase Amount (USD), dtype: float64
In [70]:
plt.figure(figsize=(10, 6))
a.plot(kind='barh', color='skyblue', edgecolor='black')
plt.title('Average Purchase Amount by Shipping Type', fontsize=16)
plt.xlabel('Average Purchase Amount (USD)', fontsize=12)
plt.ylabel('Shipping Type', fontsize=12)
for index, value in enumerate(a.values):
    plt.text(value + 0.2, index, f"${value:.2f}", va='center', fontsize=10)
plt.tight_layout()
plt.show()
No description has been provided for this image

Correlation and Relationships¶

16. Correlation Analysis:¶

• Are there any correlations between variables such as age, purchase amount, review ratings, and previous purchases?

In [8]:
correlation_data = df[['Age', 'Purchase Amount (USD)', 'Review Rating', 'Previous Purchases']]
correlation_matrix = correlation_data.corr()
correlation_matrix
Out[8]:
Age Purchase Amount (USD) Review Rating Previous Purchases
Age 1.000000 -0.010424 -0.021949 0.040445
Purchase Amount (USD) -0.010424 1.000000 0.030776 0.008063
Review Rating -0.021949 0.030776 1.000000 0.004229
Previous Purchases 0.040445 0.008063 0.004229 1.000000
In [9]:
correlation_matrix
Out[9]:
Age Purchase Amount (USD) Review Rating Previous Purchases
Age 1.000000 -0.010424 -0.021949 0.040445
Purchase Amount (USD) -0.010424 1.000000 0.030776 0.008063
Review Rating -0.021949 0.030776 1.000000 0.004229
Previous Purchases 0.040445 0.008063 0.004229 1.000000
In [10]:
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.show()
No description has been provided for this image

17. Season and Frequency Relationship:¶

• Does the frequency of purchases vary across seasons?

In [80]:
x = df['Season'].value_counts()
x
Out[80]:
Season
Spring    999
Fall      975
Winter    971
Summer    955
Name: count, dtype: int64
In [85]:
plt.figure(figsize=(8, 6))
sns.barplot(x=x.index, y=x.values, palette='viridis', edgecolor='black', width=0.5)
plt.title('Frequency of Purchases Across Seasons', fontsize=16)
plt.xlabel('Season', fontsize=12)
plt.ylabel('Number of Purchases', fontsize=12)
for index, value in enumerate(x.values):
    plt.text(index, value + 5, str(value), ha='center', fontsize=10)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [84]:
plt.figure(figsize=(6, 6))
x.plot.pie(autopct='%1.1f%%', startangle=140)
plt.title('Frequency of Purchases Across Seasons', fontsize=14)
plt.ylabel('')
plt.tight_layout()
plt.show()
No description has been provided for this image

18. Category and Payment Methods:¶

• Are certain payment methods more common for specific product categories?

In [11]:
category_payment_counts = df.groupby(['Category', 'Payment Method']).size().unstack()
category_payment_counts
Out[11]:
Payment Method Bank Transfer Cash Credit Card Debit Card PayPal Venmo
Category
Accessories 198 200 245 195 199 203
Clothing 291 281 319 286 274 286
Footwear 98 105 84 91 113 108
Outerwear 45 62 48 61 52 56
In [12]:
category_payment_percentage = category_payment_counts.div(category_payment_counts.sum(axis=1), axis=0) * 100
category_payment_percentage
Out[12]:
Payment Method Bank Transfer Cash Credit Card Debit Card PayPal Venmo
Category
Accessories 15.967742 16.129032 19.758065 15.725806 16.048387 16.370968
Clothing 16.753022 16.177317 18.364997 16.465170 15.774324 16.465170
Footwear 16.360601 17.529215 14.023372 15.191987 18.864775 18.030050
Outerwear 13.888889 19.135802 14.814815 18.827160 16.049383 17.283951
In [13]:
plt.figure(figsize=(6, 6))
sns.heatmap(category_payment_percentage, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
plt.title('Payment Method Preferences by Category (Percentage)', fontsize=16)
plt.xlabel('Payment Method', fontsize=12)
plt.ylabel('Category', fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [14]:
ax=category_payment_counts.plot(kind='bar', figsize=(12, 8), stacked=False, colormap='tab20', edgecolor='black')
plt.title('Payment Method Preferences by Category', fontsize=16)
plt.xlabel('Category', fontsize=12)
plt.ylabel('Number of Purchases', fontsize=12)
plt.legend(title='Payment Method', fontsize=10, title_fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.tight_layout()
for container in ax.containers:
    ax.bar_label(container, fmt='%d', label_type='edge', fontsize=9, padding=4)
plt.show()
No description has been provided for this image
In [103]:
import matplotlib.pyplot as plt
plt.figure(figsize=(16, 10))
ax=category_payment_counts.plot(kind='bar', stacked=True, figsize=(16, 10), colormap='tab20')
plt.title('Payment Method Preferences by Category')
plt.xlabel('Category')
plt.ylabel('Number of Purchases')
plt.xticks(rotation=360)
plt.legend(title='Payment Method')
for container in ax.containers:
    ax.bar_label(container, label_type='center', fmt='%d')
plt.show()
<Figure size 1600x1000 with 0 Axes>
No description has been provided for this image

19. Gender vs. Preferences:¶

• Do preferences for size, color, or category differ by gender?

In [9]:
size_counts = df.groupby(['Gender', 'Size']).size().unstack()
size_counts
Out[9]:
Size L M S XL
Gender
Female 337 590 187 134
Male 716 1165 476 295
In [10]:
size_counts_percent = size_counts.div(size_counts.sum(axis=1), axis=0) * 100
size_counts_percent
Out[10]:
Size L M S XL
Gender
Female 27.003205 47.275641 14.983974 10.737179
Male 26.998492 43.929110 17.948718 11.123680
In [17]:
size_melted = size_counts.reset_index().melt(id_vars='Gender', var_name='Size', value_name='Count')
plt.figure(figsize=(10, 6))
sns.barplot(data=size_melted, x='Gender', y='Count', hue='Size', palette='viridis', edgecolor='black')
plt.title('Size Preferences by Gender', fontsize=16)
plt.xlabel('Gender', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.legend(title='Size', fontsize=10, title_fontsize=12)
plt.tight_layout()
for p in plt.gca().patches:
    plt.text(p.get_x() + p.get_width() / 2, p.get_height() + 15, 
             '{:1.0f}'.format(p.get_height()), ha='center', va='center',fontsize=10)
plt.show()
No description has been provided for this image
In [20]:
color_counts = df.groupby(['Gender', 'Color']).size().unstack()
color_counts
Out[20]:
Color Beige Black Blue Brown Charcoal Cyan Gold Gray Green Indigo ... Peach Pink Purple Red Silver Teal Turquoise Violet White Yellow
Gender
Female 40 54 52 46 50 47 41 53 56 45 ... 48 58 42 44 52 53 40 53 45 60
Male 107 113 100 95 103 119 97 106 113 102 ... 101 95 109 104 121 119 105 113 97 114

2 rows × 25 columns

In [22]:
color_percent = color_counts.div(color_counts.sum(axis=1), axis=0) * 100
color_percent
Out[22]:
Color Beige Black Blue Brown Charcoal Cyan Gold Gray Green Indigo ... Peach Pink Purple Red Silver Teal Turquoise Violet White Yellow
Gender
Female 3.205128 4.326923 4.166667 3.685897 4.006410 3.766026 3.285256 4.246795 4.487179 3.605769 ... 3.846154 4.647436 3.365385 3.525641 4.166667 4.246795 3.205128 4.246795 3.605769 4.807692
Male 4.034691 4.260935 3.770739 3.582202 3.883861 4.487179 3.657617 3.996983 4.260935 3.846154 ... 3.808446 3.582202 4.110106 3.921569 4.562594 4.487179 3.959276 4.260935 3.657617 4.298643

2 rows × 25 columns

In [34]:
color_melted = color_counts.reset_index().melt(id_vars='Gender', var_name='Color', value_name='Count')
plt.figure(figsize=(16, 10))
sns.barplot(data=color_melted, x='Gender', y='Count', hue='Color', palette='viridis', edgecolor='black')
plt.title('Color Preferences by Gender', fontsize=16)
plt.xlabel('Color', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.legend(title='Color', bbox_to_anchor=(0,1), fontsize=10, title_fontsize=12)
plt.tight_layout()
for p in plt.gca().patches:
    plt.text(p.get_x() + p.get_width() / 2, p.get_height() + 1, 
             '{:1.0f}'.format(p.get_height()), ha='center', va='center',fontsize=10)
plt.show()
No description has been provided for this image
In [35]:
category_counts = df.groupby(['Gender', 'Category']).size().unstack()
category_counts
Out[35]:
Category Accessories Clothing Footwear Outerwear
Gender
Female 392 556 199 101
Male 848 1181 400 223
In [36]:
category_percent = category_counts.div(category_counts.sum(axis=1), axis=0) * 100
category_percent
Out[36]:
Category Accessories Clothing Footwear Outerwear
Gender
Female 31.410256 44.551282 15.945513 8.092949
Male 31.975867 44.532428 15.082956 8.408748
In [40]:
category_melted = category_counts.reset_index().melt(id_vars='Gender', var_name='Category', value_name='Count')
plt.figure(figsize=(12, 8))
barplot = sns.barplot(data=category_melted, x='Gender', y='Count', hue='Category', palette='viridis')
plt.title('Category Preferences by Gender', fontsize=16)
plt.xlabel('Gender', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.legend(title='Category', fontsize=10, title_fontsize=12)
plt.tight_layout()
for p in plt.gca().patches:
    plt.text(p.get_x() + p.get_width() / 2, p.get_height() + 10, 
             '{:1.0f}'.format(p.get_height()), ha='center', va='center',fontsize=14)
plt.show()
No description has been provided for this image

Advanced Visualizations¶

20. Heatmaps:¶

• Create a correlation heatmap to show relationships between numeric variables (e.g., age, purchase amount, previous purchases, review rating).

In [15]:
X = ['Age', 'Purchase Amount (USD)', 'Previous Purchases', 'Review Rating']
data = df[X]
data
Out[15]:
Age Purchase Amount (USD) Previous Purchases Review Rating
0 55 53 14 3.1
1 19 64 2 3.1
2 50 73 23 3.1
3 21 90 49 3.5
4 45 49 31 2.7
... ... ... ... ...
3895 40 28 32 4.2
3896 52 49 41 4.5
3897 46 33 24 2.9
3898 44 77 24 3.8
3899 52 81 33 3.1

3900 rows × 4 columns

In [16]:
correlation_matrix = data.corr()
correlation_matrix
Out[16]:
Age Purchase Amount (USD) Previous Purchases Review Rating
Age 1.000000 -0.010424 0.040445 -0.021949
Purchase Amount (USD) -0.010424 1.000000 0.008063 0.030776
Previous Purchases 0.040445 0.008063 1.000000 0.004229
Review Rating -0.021949 0.030776 0.004229 1.000000
In [17]:
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap of Numeric Variables', fontsize=16)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12, rotation=0)
plt.tight_layout()
plt.show()
No description has been provided for this image

21. Pairplots:¶

• Use pairplots to explore pairwise relationships between numeric variables.

In [50]:
sns.pairplot(df, vars=data, hue='Gender', diag_kind='kde', palette='Set2')
plt.suptitle('Pairplot of Numeric Variables by Gender', y=1.02, fontsize=16)
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
No description has been provided for this image

22. Category and Purchase Amount¶

Analysis: • Visualize the distribution of purchase amounts for each category using boxplots or violin plots.

In [60]:
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Category', y='Purchase Amount (USD)', palette='Set2')
plt.title('Distribution of Purchase Amounts by Category', fontsize=16)
plt.xlabel('Category', fontsize=12)
plt.ylabel('Purchase Amount', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [56]:
medians = df.groupby('Category')['Purchase Amount (USD)'].median()
medians
Out[56]:
Category
Accessories    60.0
Clothing       60.0
Footwear       60.0
Outerwear      54.5
Name: Purchase Amount (USD), dtype: float64
In [58]:
plt.figure(figsize=(12, 6))
sns.violinplot(data=df, x='Category', y='Purchase Amount (USD)', palette='Set2', inner='quartile')
plt.title('Distribution of Purchase Amounts by Category (Violin Plot)', fontsize=16)
plt.xlabel('Category', fontsize=12)
plt.ylabel('Purchase Amount', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
for index, median in enumerate(medians):
    plt.text(index, median, f'{median:.2f}', ha='center', va='center', fontsize=10)
plt.show()
No description has been provided for this image
In [ ]: